第4章 多変数をとらえるデータ可視化

図の右上のshowボタンを押すとRのコードが表示されます。

4.1.1

Rでレーダーチャートを描くパッケージはいくつかあるのだが、どれもイマイチ納得いかなかったのでポーラーチャートで勘弁して下さい。

library(conflicted)
library(tidyverse)
library(patchwork)

data1 <- data.frame(
  person = c("Aさん", "Bさん"), 
  数学 = c(5, 2),
  国語 = c(5, 3),
  理科 = c(5, 4),
  社会 = c(4, 3),
  体育 = c(4, 4),
  音楽 = c(5, 2)
  ) |>
  pivot_longer(!person)

# compelling data
data2 <- data.frame(
  person = c('Aさん', 'Bさん'),
  数学 = c(5, 5),
  国語 = c(5, 1),
  理科 = c(1, 5),
  社会 = c(1, 1),
  体育 = c(1, 5),
  音楽 = c(5, 1)
) |>
  pivot_longer(!person)

# 並び1
p1 <- data1 |>
  mutate(name = factor(name, levels = c("数学", "国語", "理科", "社会", "体育", "音楽"))) |>
  ggplot(aes(x = name, y = value, fill = person, group = person)) +
  geom_col(position = position_dodge(width=1)) +
  coord_polar() +
    theme(
      legend.position="none", 
      axis.title.x=element_blank(), 
      axis.title.y=element_blank()
    )

# 並び2
p2 <- data1 |>
  mutate(name = factor(name, levels = c("音楽", "数学", "理科", "体育", "社会", "国語"))) |>
  ggplot(aes(x = name, y = value, fill = person, group = person)) +
  geom_col(position = position_dodge(width=1)) +
  coord_polar() +
    theme(
      legend.position="none", 
      axis.title.x=element_blank(), 
      axis.title.y=element_blank()
    )

# 並び1
p3 <- data2 |>
  mutate(name = factor(name, levels = c("数学", "国語", "理科", "社会", "体育", "音楽"))) |>
  ggplot(aes(x = name, y = value, fill = person, group = person)) +
  geom_col(position = position_dodge(width=1)) +
  coord_polar() +
    theme(
      legend.position="none", 
      axis.title.x=element_blank(), 
      axis.title.y=element_blank()
    )

# 並び2
p4 <- data2 |>
  mutate(name = factor(name, levels = c("音楽", "数学", "理科", "体育", "社会", "国語"))) |>
  ggplot(aes(x = name, y = value, fill = person, group = person)) +
  geom_col(position = position_dodge(width=1)) +
  coord_polar() +
    theme(
      legend.position="none", 
      axis.title.x=element_blank(), 
      axis.title.y=element_blank()
    )

(p1 + p2) / (p3 + p4)

4.1.2

書籍には無いが、品種ごとに分けた可視化もおまけで付けました。

library(conflicted)
library(tidyverse)
library(patchwork)

v_names <- c(
    "ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分", 
    "ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
    "非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
    "OD280/OD315値", "プロリン"
  )

df <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  col_names = v_names,
  col_types = "f"
  ) |>
  mutate(across(where(is.numeric), \(x) scale(x))) |> #標準化
  rowid_to_column() |>
  pivot_longer(!c(ブドウの品種, rowid)) |>
  mutate(name = factor(name, levels = v_names))

df |>
  ggplot(aes(x = name, y = value, colour = ブドウの品種, group =rowid)) +
  geom_line(alpha = 0.5) +
  coord_flip() +
  labs(title = "ワインの特徴とブドウの品種", y = "相対スコア", x = "")

df |>
  ggplot(aes(x = name, y = value, colour = ブドウの品種, group =rowid)) +
  geom_line(alpha = 0.5) +
  coord_flip() +
  facet_wrap(vars(ブドウの品種)) +
  labs(title = "ワインの特徴とブドウの品種", y = "相対スコア", x = "")

4.1.3

library(conflicted)
library(tidyverse)

v_names <- c(
    "ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分", 
    "ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
    "非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
    "OD280/OD315値", "プロリン"
  )

df <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  col_names = v_names,
  col_types = "f"
) |>
  mutate(across(where(is.numeric), \(x) scale(x))) |> #標準化
  rowid_to_column() |>
  pivot_longer(!c(ブドウの品種, rowid)) |>
  mutate(name = factor(name, levels = v_names))

p1 <- df |>
  ggplot(aes(x = name, y = value, group = ブドウの品種, fill = ブドウの品種)) +
  stat_summary(geom = "bar", fun = "mean", position = "dodge2") +
  stat_summary(geom = "errorbar", fun.data = "mean_se", position = "dodge2") +
  labs(y = "相対スコア", x = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- df |>
  ggplot(aes(x = name, y = value, fill = ブドウの品種)) +
  geom_violin(position="dodge") +
  labs(y = "相対スコア", x = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  

p1 / p2

4.1.4

library(conflicted)
library(tidyverse)

v_names <- c(
    "ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分", 
    "ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
    "非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
    "OD280/OD315値", "プロリン"
  )

df <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  col_names = v_names,
  col_types = "f"
  ) |>
  mutate(across(where(is.numeric), \(x) scale(x))) |> #標準化
  rowid_to_column() |>
  pivot_longer(!c(rowid, ブドウの品種)) |>
  mutate(name = factor(name, levels = rev(v_names)))

jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))

df |>
  ggplot(aes(x = rowid, y = name, fill = value)) +
  geom_tile() +
  facet_wrap(vars(ブドウの品種), scales = "free_x") + 
  scale_fill_gradientn(colours = jet.colors(100)) +
  labs(x = "ワイン銘柄番号", y = "", title = "カラーコードで値を表現する")

4.1.5

library(conflicted)
library(tidyverse)

df <- read_csv(
  "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/4%E7%AB%A0/data/behavior_data.csv"
  ) |>
  mutate(Time = Time / 3600)  |> # Time列を秒から時間に変換
  pivot_longer(!Time) |>
  mutate(
    value = case_when(
      value == "Garbage" ~ "ゴミ捨て場",
      value == "Nest" ~ "寝室",
      value == "Other" ~ "一般の部屋",
      value == "Toilet" ~ "トイレ"
      ) |>
      factor(levels = c("ゴミ捨て場", "トイレ", "寝室", "一般の部屋"))
    ) 

df |>
  ggplot(aes(x =name , y = Time, fill = value)) +
  geom_tile() +
  scale_y_reverse() +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "ヒートマップによる行動時系列の可視化", x = "個体", y = "時間[h]")

4.1.6

library(conflicted)
library(tidyverse)
library(patchwork)

# CSVデータを読み込む
df <- read_csv(
  "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/4%E7%AB%A0/data/behavior_data.csv"
) |>
  mutate(Time = Time / 3600)  |> # Time列を秒から時間に変換
  pivot_longer(!Time) |>
  mutate(
    value = case_when(
      value == "Garbage" ~ "ゴミ捨て場",
      value == "Nest" ~ "寝室",
      value == "Other" ~ "一般の部屋",
      value == "Toilet" ~ "トイレ"
    ) |>
      factor(levels = rev(c("ゴミ捨て場", "トイレ", "寝室", "一般の部屋")))
  )

# 1個体(J)のデータ
df_j <- df |>
  dplyr::filter(name == "J") |> 
  dplyr::select(!name) |>
  mutate(action = "1")

p1 <- df_j |>
  ggplot(aes(x = value, y = Time, fill = value)) +
  geom_tile() +
  scale_y_reverse() +
  labs(x = "", y = "時間[h]") +
  scale_fill_brewer(palette = "Set1") +
  theme(
    legend.position="none",
    aspect.ratio = 4 / 1.5
    )

p2 <- df_j |>
  dplyr::filter(Time >= 13, Time <= 16) |>
  ggplot(aes(x = value, y = Time, fill = value)) +
  geom_tile() +
  scale_y_reverse() +
  labs(x = "", y = "時間[h]") +
  scale_fill_brewer(palette = "Set1") +
  theme(
    legend.position="none",
    aspect.ratio = 4 / 1.5
  )

p1 + p2

4.2.1

library(conflicted)
library(tidyverse) 
library(patchwork)

# 共著データ
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
collaboration <- matrix(
  c(
    NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
    1, NA, 0, 1, 1, 1, 0, 0, 0, 1,
    1, 0, NA, 0, 0, 0, 1, 1, 1, 0,
    0, 1, 0, NA, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 0, NA, 0, 0, 0, 0, 0,
    0, 1, 0, 0, 0, NA, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, NA, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, NA, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, NA, 0,
    1, 1, 0, 0, 0, 0, 0, 0, 0, NA
    ),
  nrow=10,
  dimnames = list(researchers, researchers)
  )

p1 <- collaboration |>
  as.data.frame() |>
  rownames_to_column() |>
  pivot_longer(!rowname) |>
  mutate(
    value = factor(value),
    name = factor(name, levels = rev(LETTERS[1:10]))
    ) |>
  ggplot(aes(x = rowname, y = name, fill = value, label = value)) +
  geom_tile() +
  geom_text() +
  scale_fill_hue(name = "", labels = c("0" = "共著なし", "1" ="共著あり")) +
  labs(title = "共著関係の有無", x = "", y = "") +
  theme(aspect.ratio = 1)

# ここはもうちょっとスマートに書けないか?
set.seed(0)
v0 <- runif(n = (100 -10)/2, min = 0.0, max = 0.5) |> round(2)
v1 <- runif(n = (100 -10)/2, min = 0.5, max = 1.0) |> round(2)
collaboration_score <- collaboration
collaboration_score[upper.tri(collaboration_score)] <-
  if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0) 
collaboration_score <- t(collaboration_score)
collaboration_score[upper.tri(collaboration_score)] <-
  if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0)

jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))

p2 <- collaboration_score |>
  as.data.frame() |>
  rownames_to_column() |>
  pivot_longer(!rowname) |>
  mutate(name = factor(name, levels = rev(LETTERS[1:10]))) |>
  ggplot(aes(x = rowname, y = name, fill = value, label = value)) +
  geom_tile() +
  geom_text(size = 2) +
  labs(title = "研究の興味の類似度", x = "", y = "") +
  theme(aspect.ratio = 1) +
  scale_fill_gradientn(colours = jet.colors(100))

p1 + p2

4.2.2

library(conflicted)
library(tidyverse)
library(igraph)

# 共著データ
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
collaboration <- matrix(
  c(
    NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
    1, NA, 0, 1, 1, 1, 0, 0, 0, 1,
    1, 0, NA, 0, 0, 0, 1, 1, 1, 0,
    0, 1, 0, NA, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 0, NA, 0, 0, 0, 0, 0,
    0, 1, 0, 0, 0, NA, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, NA, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, NA, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, NA, 0,
    1, 1, 0, 0, 0, 0, 0, 0, 0, NA
  ),
  nrow=10,
  dimnames = list(researchers, researchers)
)

# 研究の興味の類似度
set.seed(0)
v0 <- runif(n = (100 -10)/2, min = 0.0, max = 0.5) |> round(2)
v1 <- runif(n = (100 -10)/2, min = 0.5, max = 1.0) |> round(2)
collaboration_score <- collaboration
collaboration_score[upper.tri(collaboration_score)] <-
  if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0) 
collaboration_score <-
  t(collaboration_score)
collaboration_score[upper.tri(collaboration_score)] <-
  if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0)

# 共著ネットワーク
diag(collaboration) <- 0
collabo_g <- collaboration |>
  graph_from_adjacency_matrix(mode = "undirected")

set.seed(0)

par(mfrow = c(2, 2))

collabo_g |>
  plot(layout = layout_in_circle)

collabo_g |>
  plot()

# 類似度ネットワーク
diag(collaboration_score) <- 0
score_g <- collaboration_score |>
  graph_from_adjacency_matrix(mode = "undirected", weighted = TRUE)

jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
col_p <- round((E(score_g)$weight / max(E(score_g)$weight)) * 100, 0)

set.seed(0)

collabo_g |>
  plot(
    layout = layout_in_circle,
    edge.width = E(score_g)$weight * 10,
    edge.color = jet.colors(100)[col_p]
    )

score_g |>
  plot(
    edge.width = E(score_g)$weight * 10,
    edge.color = jet.colors(100)[col_p]
    )

par(mfrow = c(1, 1))

4.2.3

4.2.3.1
library(conflicted)
library(tidyverse)

# 研究者リスト
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")

# 共著データ
collaboration <- matrix(
  c(
    NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
    0, NA, 0, 1, 1, 1, 0, 0, 0, 0,
    0, 0, NA, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, NA, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, NA, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, NA, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, NA, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, NA, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, NA, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, NA
    ),
  nrow = 10,
  dimnames = list(researchers, researchers)
  )

# 共著ネットワーク
collaboration |>
  as.data.frame() |>
  rownames_to_column() |>
  pivot_longer(!rowname) |>
  mutate(
    value = factor(value),
    name = factor(name, levels = rev(LETTERS[1:10]))
    ) |>
  ggplot(aes(x = rowname, y = name, fill = value, label = value)) +
  geom_tile() +
  geom_text() +
  scale_fill_hue(name = "", labels = c("0" = "指導関係なし", "1" ="指導関係あり")) +
  labs(title = "隣接行列表示", x = "指導された研究者", y = "指導した研究者") +
  theme(aspect.ratio = 1)

4.2.3.2
library(conflicted)
library(tidyverse)
library(igraph)

# 研究者リスト
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")

# 共著データ
collaboration <- matrix(
  c(
    NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
    0, NA, 0, 1, 1, 1, 0, 0, 0, 0,
    0, 0, NA, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, NA, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, NA, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, NA, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, NA, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, NA, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, NA, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, NA
    ),
  nrow = 10,
  dimnames = list(researchers, researchers)
  )

diag(collaboration) <- 0

collaboration |>
  t() |>
  graph_from_adjacency_matrix(mode = "directed") |>
  plot(layout = layout_as_tree)

4.2.4

circoレイアウトがigraphには無いようなのでMDSレイアウトで代用。

library(conflicted)
library(tidyverse)
library(igraph)

set.seed(0)
G_er <- erdos.renyi.game(30, 0.2)
G_ws <- watts.strogatz.game(1, 30, 5, 0.1)
G_ba <- barabasi.game(30, 1)

ErdosR <- as_adjacency_matrix(G_er, sparse = FALSE)
WattsStrogatz <- as_adjacency_matrix(G_ws, sparse = FALSE)
BarabasiAlbert <- as_adjacency_matrix(G_ba, sparse = FALSE)

net1 <- graph_from_adjacency_matrix(ErdosR, mode = "max")
net2 <- graph_from_adjacency_matrix(WattsStrogatz, mode = "max")
net3 <- graph_from_adjacency_matrix(BarabasiAlbert, mode = "max")

par(mfrow = c(3,3))
plot(net1, layout = layout_in_circle,
     main = "Erdos-Reyni: Circular Layout", vertex.size = 9, vertex.label=NA, )
plot(net2, layout = layout_in_circle,
     main = "Watts-Strogatz: Circular Layout", vertex.size = 9, vertex.label=NA)
plot(net3, layout = layout_in_circle,
     main = "Barabasi-Albert: Circular Layout", vertex.size = 9, vertex.label=NA)

plot(net1, layout = layout_with_mds,
     main = "Erdos-Reyni: MDS Layout", vertex.size = 9, vertex.label=NA)
plot(net2, layout = layout_with_mds,
     main = "Watts-Strogatz: MDS Layout", vertex.size = 9, vertex.label=NA)
plot(net3, layout = layout_with_mds,
     main = "Barabasi-Albert: MDS Layout", vertex.size = 9, vertex.label=NA)

plot(net1, layout = layout_with_kk(net1),
     main = "Erdos-Reyni: KK Layout", vertex.size = 9, vertex.label=NA)
plot(net2, layout = layout_with_kk(net2),
     main = "Watts-Strogatz: KK Layout", vertex.size = 9, vertex.label=NA)
plot(net3, layout = layout_with_kk(net3),
     main = "Barabasi-Albert: KK Layout", vertex.size = 9, vertex.label=NA)

par(mfrow = c(1,1))

4.2.5

dotとcircoが無いのでtree, mdsで代用

library(conflicted)
library(tidyverse)
library(igraph)

# ランダム有向グラフを生成
set.seed(0)
G_dir_random <- sample_gnm(30, 81, directed = TRUE) 

# 階層構造を持つ有向グラフを生成
set.seed(0)
G_dir_hierarchy <- sample_tree(30, directed = TRUE)

my_plot <- function(data, layout){
  plot(
    data, layout = layout,
    vertex.color="yellow",
    vertex.label = NA,
    edge.arrow.size = 0.5
    )
}
par(mfrow = c(3,2))
my_plot(G_dir_random, layout_as_tree)
my_plot(G_dir_hierarchy, layout_as_tree)
my_plot(G_dir_random, layout_with_mds)
my_plot(G_dir_hierarchy, layout_with_mds)
my_plot(G_dir_random, layout_with_kk)
my_plot(G_dir_hierarchy, layout_with_kk)

par(mfrow = c(1,1))

4.3.1

library(conflicted)
library(tidyverse)
library(pheatmap)

df <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  col_names = c(
    "ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分", 
    "ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
    "非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
    "OD280/OD315値", "プロリン"
  ),
  col_types = "f"
) |>
  mutate(across(where(is.numeric), \(x) scale(x))) #標準化

jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))

pheatmap(
  df |>
    dplyr::select(!ブドウの品種) |>
    t(),
  color = jet.colors(50),
  clustering_method = "ward.D2"
  )

4.3.3

BIRCHだけ見つからないので省略。

ちなみにRではmlbenchパッケージにベンチマーク用の様々なデータセットと人工データ生成用の関数が用意されています。

library(conflicted)
library(tidyverse)
library(ClusterR)   # MiniBatch KMeans
library(apcluster)  # Affinity Propagation
library(meanShiftR) # MeanShift
library(skmeans)    # Spectral Clustering
library(cluster)    # Agglomerative Clustering
library(dbscan)     # DBSCAN, HDBSCAN, OPTICS,
library(mclust)     # Gaussian Mixture
library(patchwork)

# データ用意
my_read_csv <- function(file){
  read_csv(file, col_names = c("x", "y", "class")) |>
    mutate( #標準化しておく
      x = (x - mean(x))/sd(x),
      y = (y - mean(x))/sd(x),  
      class = factor(class + 1))
}

# 円形のクラスタ
noisy_circles <- my_read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/noisy_circles.csv"
  )

# 月型のクラスタ
noisy_moons <- my_read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/noisy_moons.csv"
  )     

# 正規分布に従うクラスタ
blobs <- my_read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/blobs.csv"
  )    

# 異方性のあるデータ
aniso <- my_read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/aniso.csv"
  )       

# 3つの正規分布に従うデータ
varied <- my_read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/varied.csv"
  )               

n <- 500
set.seed(0)
no_structure <- data.frame(x = runif(n), y = runif(n), class = factor("0"))# 構造のないデータ

# 各手法のラッパー関数を用意
# MiniBatch KMeans
my_MiniBatchKmeans <- function(data, cluster_num) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  fit <- MiniBatchKmeans(data_cleaned, clusters = cluster_num)
  pred <- predict(fit, data_cleaned)
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# Affinity Propagation
my_apcluster <- function(data, cluster_num) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- apcluster(s = negDistMat(r=2), x = data_cleaned, p = -200, q = 0.9) |>
    cutree(cluster_num) |>
    labels(type = "enum")
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# MeanShift
my_meanShift <- function(data) {
  data_cleaned <- as.matrix(data[, 1:2])
  start <- Sys.time()
  pred <- meanShift(data_cleaned, data_cleaned)$assignment
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# Spectral Clustering
my_skmeans <- function(data, cluster_num) {
  data_cleaned <- base::as.matrix(data[, 1:2])
  start <- Sys.time()
  pred <- skmeans(data_cleaned, k = cluster_num)$cluster
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# Ward
my_hclust <- function(data, cluster_num) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- data_cleaned |>
    stats::dist() |>
    hclust(method = "ward.D2") |>
    cutree(k = cluster_num)
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# Agglomerative Clustering
my_agnes <- function(data, cluster_num) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- agnes(data_cleaned) |> cutree(k = cluster_num)
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# DBSCAN
my_dbscan <- function(data) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- dbscan::dbscan(data_cleaned, eps = 0.3)$cluster
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# HDBSCAN
my_hdbscan <- function(data) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- dbscan::hdbscan(data_cleaned, minPts = 15)$cluster
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# OPTICS
my_optics <- function(data) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- dbscan::optics(data_cleaned, eps = 0.1, minPts = 7) |>
    extractXi(xi = 0.05) |>
    pluck("cluster")
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

# Gaussian Mixture
my_Mclust <- function(data) {
  data_cleaned <- data[, 1:2]
  start <- Sys.time()
  pred <- mclust::Mclust(data_cleaned, G = 1:3)$classification
  end <- Sys.time()
  diff <- end - start
  data$pred <- as.factor(pred)
  return(list(res = data, diff = diff))
}

### クラスタリング実行
dats <- list(noisy_circles, noisy_moons, varied, aniso, blobs, no_structure)
cnums <- list(2,2,3,3,3,3)

set.seed(0)
res_clustering <- list(
  res_MiniBatchKmeans = map2(dats, cnums, \(dats, cnums) my_MiniBatchKmeans(dats, cnums)),
  res_apcluster       = map2(dats, cnums, \(dats, cnums) my_apcluster(dats, cnums)),
  res_meanShift       = purrr::map(dats, \(dats) my_meanShift(dats)),
  res_skmeans         = map2(dats, cnums, \(dats, cnums) my_skmeans(dats, cnums)),
  res_hclust          = map2(dats, cnums, \(dats, cnums) my_hclust(dats, cnums)),
  res_agnes           = map2(dats, cnums, \(dats, cnums) my_agnes(dats, cnums)),
  res_dbscan          = purrr::map(dats, \(dats) my_dbscan(dats)),
  res_hdbscan         = purrr::map(dats, \(dats) my_hdbscan(dats)),
  res_optics          = purrr::map(dats, \(dats) my_optics(dats)),
  res_Mclust          = purrr::map(dats, \(dats) my_Mclust(dats))
) 

# 描画用ラッパー関数を用意
my_plot <- function(result) {
  result$res |>
    ggplot(aes(x = x, y = y, color = pred)) +
    geom_point(size = 1) + 
    labs(caption = paste0(round(result$diff,3),"s")) +
    theme(
      axis.ticks = element_blank(),  # tickの線を消す
      axis.text = element_blank(),   # tickの数字を消す
      axis.title = element_blank(),  # 軸のラベルを消す
      axis.line = element_blank(),   # 軸の線を消す
      legend.position="none",
      aspect.ratio = 1
      )
}

nums <- expand_grid(d = 1:6, m = 1:10)
res_plot <- map2(nums$m, nums$d, \(.x, .y) my_plot(res_clustering[[.x]][[.y]]))

# ここがダサい……もうちょっとどうにかならないか
res_plot[[1]] + res_plot[[2]] +res_plot[[3]] +res_plot[[4]] +res_plot[[5]] +res_plot[[6]] +
  res_plot[[7]] + res_plot[[8]] +res_plot[[9]] +res_plot[[10]] +res_plot[[11]] +res_plot[[12]] +
  res_plot[[13]] + res_plot[[14]] +res_plot[[15]] +res_plot[[16]] +res_plot[[17]] +res_plot[[18]] +
  res_plot[[19]] + res_plot[[20]] +res_plot[[21]] +res_plot[[22]] +res_plot[[23]] +res_plot[[24]] +
  res_plot[[25]] + res_plot[[26]] +res_plot[[27]] +res_plot[[28]] +res_plot[[29]] +res_plot[[30]] +
  res_plot[[31]] + res_plot[[32]] +res_plot[[33]] +res_plot[[34]] +res_plot[[35]] +res_plot[[36]] +
  res_plot[[37]] + res_plot[[38]] +res_plot[[39]] +res_plot[[40]] +res_plot[[41]] +res_plot[[42]] +
  res_plot[[43]] + res_plot[[44]] +res_plot[[45]] +res_plot[[46]] +res_plot[[47]] +res_plot[[48]] +
  res_plot[[49]] + res_plot[[50]] +res_plot[[51]] +res_plot[[52]] +res_plot[[53]] +res_plot[[54]] +
  res_plot[[55]] + res_plot[[56]] +res_plot[[57]] +res_plot[[58]] +res_plot[[59]] +res_plot[[60]] +
  plot_layout(ncol = 10)

4.3.4

library(conflicted)
library(tidyverse)
library(GGally)

df <- read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/df_434.csv"
  )

df |>
  ggpairs()

4.3.5

library(conflicted)
library(tidyverse)
library(skmeans)
library(GGally)
library(patchwork)

df <- read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/df_434.csv"
  )

# 主成分分析(標準化)
pca_result <- prcomp(df)#, scale. = TRUE)
pca_importance <- as.data.frame(summary(pca_result)$importance[2,]) |>
  rownames_to_column()
names(pca_importance) <- c("pc", "value")

# Spectral Clustering
clusters_pca <- skmeans(pca_result$x[, 1:2], 3)

p1 <- data.frame(
  pc1 = pca_result$x[, 1],
  pc2 = pca_result$x[, 2],
  class = factor(clusters_pca$cluster)
    ) |>
  ggplot(aes(x = pc1, y = pc2, color = class)) +
  geom_point() +
  labs(title = "二つの主成分で見る")+
  theme(legend.position="none", aspect.ratio = 1)

p2 <- pca_importance |>
  ggplot(aes(x = reorder(pc, desc(value)), y = value)) +
  geom_col()+
  labs(title = "各主成分のデータ悦明力", x = "", y = "")+
  theme(aspect.ratio = 1)

p1 + p2

4.3.6

library(conflicted)
library(tidyverse)
library(MASS) # MDS(sammon)
library(Rtsne) # t-SNE
library(umap) # UMAP

df <- read_csv(
  "https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/digits.csv"
  )

df_cleaned <- df |>
  dplyr::select(!target)

# 各手法で2次元に圧縮
# 主成分分析
X_pca <- prcomp(df_cleaned)$x[, 1:2]

# t-SNE
set.seed(42)
X_tsne <- Rtsne(df_cleaned, num_threads = 2)$Y

# MDS
X_mds <- df_cleaned |>
  dist() |>
  sammon(trace = FALSE) |>
  pluck("points")

# UMAP
set.seed(42)
X_umap <- umap(df_cleaned)$layout

# K-means
set.seed(42)
clusters_pca <- kmeans(X_pca, 10)$cluster  # PCA
set.seed(42)
clusters_mds <- kmeans(X_mds, 10)$cluster #MDS
set.seed(42)
clusters_tsne <- kmeans(X_tsne, 10)$cluster # t-SNE
set.seed(42)
clusters_umap <- kmeans(X_umap, 10)$cluster  # UMAP

# 結果をデータフレームにまとめる
df_base <- data.frame(
    x = c(X_pca[, 1], X_mds[, 1], X_tsne[, 1], X_umap[, 1]),
    y = c(X_pca[, 2], X_mds[, 2], X_tsne[, 2], X_umap[, 2])
    )

n <- 1797

dimred <- bind_rows(
  df_base |>
    mutate(
      label = rep(df$target, 4),
      method = c(rep("pca_label", n), rep("mds_label", n),rep("tsne_label", n),rep("umap_label", n))
      ),
  df_base |>
    mutate(
      label = c(clusters_pca, clusters_mds, clusters_tsne, clusters_umap),
      method = c(rep("pca_kmeans", n), rep("mds_kmeans", n),rep("tsne_kmeans", n),rep("umap_kmeans", n))
      )
) |>
  mutate(
    method = factor(
      method,
      levels = c("pca_kmeans", "pca_label", "mds_kmeans", "mds_label",
                 "tsne_kmeans",  "tsne_label", "umap_kmeans", "umap_label")),
    label = factor(label)
    )

# クラスタリング結果と正解ラベルの描画
dimred |>
  ggplot(aes(x = x, y = y, color = label)) + 
  geom_point() + 
  labs(title="様々な次元圧縮方法") +
  theme(
    axis.ticks = element_blank(),  # tickの線を消す
    axis.text = element_blank(),   # tickの数字を消す
    axis.title = element_blank(),  # 軸のラベルを消す
    axis.line = element_blank(),   # 軸の線を消す
    legend.position="none",
    aspect.ratio = 1
  ) +
  facet_wrap(vars(method), nrow = 2, scales = "free")

第4章はここまで。